
library(ggplot2)
require(data.table)
library("dplyr")
library("ggpubr") 
require(tidyverse) #for easy data manipulation and visualization
require(caret) #for easy machine learning workflow

wd <- "C:\\Users\\YFan\\...\\NFood_replication\\"

pft = c(17,18,  19,20,   23,24,    41,42,    61,62,   67,68,   75,76, 77,78)#rainfed vs. irrigated
crops = c("Corn", "Wheat", "Soybean", "Cotton", "Rice", "Sugarcane", "Tropical Corn", "Tropical Soybean")
cropname = c("Maize", "Wheat", "Soy", "Cotton", "Rice", "Sugarcane")

case = c("RCP45","RCP45_85LU","RCP85","RCP85_45CO2","RCP85_SAI","RCP85_MSB","RCP85_CCT")
casename = c("RCP45",expression('RCP45'[SSP5]),"RCP85",expression('RCP85'[RCP45CO2]),"RCP85+SAI","RCP85+MSB","RCP85+CCT")



#******************************************************************************
#=====================================Preparing Data===========================
#******************************************************************************
#Prepare crop yield data (modeled)
yield = read.csv(paste0(wd, "/data/yield_data/cropyield_annual.csv"), stringsAsFactors=FALSE, na.strings =c("NA"," ","9.96921e+36"))
yield = as.data.table(yield)
yield = yield[Year!=2100,] #remove year 2100, which has a sharp jump due to termination of SRMs
nrow(yield)/length(pft)/length(case) #94 years
yield[Year<2025 & Case=="RCP85_SAI",] #make sure NA values are read correctly

#save yield data per crop as named object or data.table
#add rainfed/irrigated as a dummy variable
yield6crop=yield8crop = data.table() #6/8 crops whether/not merge Tropical Corn, Tropical Soybean
for (i in 1:8) {
  rainfed = pft[seq(1,15,2)][i]
  irrigated = pft[seq(2,16,2)][i]
  yield0 = yield[PFT==rainfed,]
  yield0$Irrigated = 0 #dummy variable, 0=rainfed, 1=irrigated
  yield1 = yield[PFT==irrigated,]
  yield1$Irrigated = 1 #dummy variable, 0=rainfed, 1=irrigated
  yield.tmp = rbind(yield0, yield1)
  yield.tmp$Crop.name = crops[i] #use same crop name for rainfed and irrigated
  yield.tmp$Crop = yield.tmp$Crop.name #rename column variable
  yield.tmp[,Crop.name:=NULL] #remove column
  yield8crop = rbind(yield8crop, yield.tmp)
  if (i<=6) {yield6crop = rbind(yield6crop, yield.tmp)} 
}
unique(yield6crop$Crop)
unique(yield8crop$Crop)
#data.table sees column names as if they are variables
yield6crop[Crop=="Corn",]$Yield..MtC.year. = yield8crop[Crop=="Corn",]$Yield..MtC.year. + yield8crop[Crop=="Tropical Corn",]$Yield..MtC.year. #temperate+tropical corn
yield6crop[Crop=="Soybean",]$Yield..MtC.year. = yield8crop[Crop=="Soybean",]$Yield..MtC.year. + yield8crop[Crop=="Tropical Soybean",]$Yield..MtC.year. #temperate+tropical soybean
yield6crop[Crop=="Corn",]$Area..Mha. = yield8crop[Crop=="Corn",]$Area..Mha. + yield8crop[Crop=="Tropical Corn",]$Area..Mha. #temperate+tropical corn
yield6crop[Crop=="Soybean",]$Area..Mha. = yield8crop[Crop=="Soybean",]$Area..Mha. + yield8crop[Crop=="Tropical Soybean",]$Area..Mha. #temperate+tropical soybean
yield6crop[,PFT:=NULL]
#verify the values are correct
mean(yield8crop[PFT==17 & Case=="RCP45" & Year >=2090 & Year <= 2099]$Yield..MtC.year.) #rainfed temperate corn
mean(yield8crop[Crop=="Corn" & Case=="RCP45" & Year >=2090 & Year <= 2099]$Yield..MtC.year.) #average of rainfed and irrigated temperate corn
mean(yield6crop[Crop=="Corn" & Irrigated==0 & Case=="RCP45" & Year >=2090 & Year <= 2099]$Yield..MtC.year.) #rainfed temperate corn+tropical corn
head(yield6crop[380:400,])
nrow(yield6crop)/6/7/2 #6 crops x 7 cases x 94 years x 2 (irrigated dummy 0/1) 

#rainfed+irrigated yield 6 crops
yield.all = yield6crop[Irrigated==0,]
yield.all$Yield..MtC.year. = yield6crop[Irrigated==0,]$Yield..MtC.year. + yield6crop[Irrigated==1,]$Yield..MtC.year. 
yield.all$Area..Mha. = yield6crop[Irrigated==0,]$Area..Mha. + yield6crop[Irrigated==1,]$Area..Mha. 
yield.all[,Irrigated:=NULL] #remove 'Irrigated'
yield.all[Crop=="Corn", Crop:="Maize"]
yield.all[Crop=="Soybean", Crop:="Soy"]
mean(yield.all[Crop=="Maize" & Case=="RCP45" & Year >=2090 & Year <= 2099]$Yield..MtC.year.) #rainfed + irrigated & temperate + tropical corn



##===============compare with FAO yield data (global summary per crop type)
#Search crop items "Maize", "Wheat", "Soybeans", "Rice, paddy", and "Seed cotton" on FAOSTAT 
fao = read.csv(paste0(wd, "/data/faostat/FAOSTAT_data_6-25-2020-CountryData-latest.csv"), stringsAsFactors=FALSE, na.strings =c("NA"," "))
fao.gl = read.csv(paste0(wd, "/data/faostat/FAOSTAT_data_6-25-2020-WorldTotal-latest.csv"), stringsAsFactors=FALSE, na.strings =c("NA"," "))
fao = as.data.table(fao)
fao.gl = as.data.table(fao.gl)
fao1 = fao[Area.Code <= 299,] #All countries

#Cotton data
#Note: search item "Seed cotton" on FAOSTAT, which includes Production, Yield and Area harvested. #otherwise harvest area is not recorded for "Cotton lint" and "Cottonseed"
#production of cotton lint is more comparable to CLM modeled, but lint yield is lower, Seed cotton is the total cotton yield  
#according to FAO, A good yield of a 160 to 180 day cotton crop under irrigation is 4 to 5 ton/ha seed cotton of which 35 percent is lint.
#on http://www.fao.org/land-water/databases-and-software/crop-information/cotton/en/: Cotton (Gossypium hirsutum) is grown for fibre and seed. Present world production is about 21 million tons lint and 59.7 million tons seed cotton from about 33.98 million ha. (FAOSTAT, 2001).)

fao.gl[Element=="Yield" & Item == "Seed cotton",.(Yield=Value/10^10*10^6), by="Year"] #seed cotton= total cotton yield 
P.cotton=fao1[Element=="Production" & Item == "Cotton lint" ,.(TotProd=sum(Value,na.rm=T)/10^6), by="Year"]
A.cotton=fao1[Element=="Area harvested" & Item == "Seed cotton",.(TotArea=sum(Value,na.rm=T)/10^6), by="Year"]
cotton=P.cotton[A.cotton, on="Year"] #latest data until 2018
cotton=cotton[, AvgYield:=TotProd/TotArea] #calc global AvgYield (or use element "Yield" for world, global weighted average)
cotton$Crop="Cotton"
  

# fao.yield.avg = fao1[Element=="Yield", .(AvgYield=sum(Value,na.rm=T)), by="Item,Year"] #hg/ha, wrong! average yield should be weighted by country area
fao.area = fao1[Element=="Area harvested", .(Area=sum(Value,na.rm=T)), by="Item,Year"] #ha, world total area per crop
sum(fao1[Element=="Area harvested" & Item=="Maize" & Year==2006,]$Value, na.rm=T)*10^-6 
fao1[Element=="Area harvested" & Item=="Seed cotton", .(Area=sum(Value,na.rm=T)*10^-6 ), by="Item,Year"]
fao1[Element=="Production" & Item=="Seed cotton", .(Prod=sum(Value,na.rm=T)*10^-6 ), by="Item,Year"]
fao1[Element=="Production" & Item=="Cotton lint", .(Prod=sum(Value,na.rm=T)*10^-6 ), by="Item,Year"]


#========for other crops directly use yield and harvest area 
#First subset crop area and avg yield to new columns
Y=fao1[Element=="Yield",] #hg/ha  
Y$Value = Y$Value/10^10*10^6; Y$Unit="Mt/Mha"
A=fao1[Element=="Area harvested",] #ha
A$Value = A$Value/10^6; A$Unit="Mha" #avoid integer overflow in below caculation
#Re-join two tables Y and A on columns Area(country), Item(crop) and Year by each i (of Y) with calculation of total yield per country/crop/year
YA=Y[A, .(Prod=Value*i.Value, CropArea=i.Value), by=.EACHI, on=.(Area,Item,Year)] #Mt: Production=AvgYield*Area
#convert per country Avg yield to global Avg yield requires area weighted sum of yield
#Or directly read Production data
P=fao1[Element=="Production",] #tonnes
P$Value = P$Value/10^6; P$Unit="Mt"
PA=P[A, .(Production=Value, CropArea=i.Value), by=.EACHI, on=.(Area,Item,Year)] #join production and area
  
#first multiply per area Yield with Area harvested and then sum yield by Year and Item(crop type)
fao.yield <- YA[Year>2005,.(TotProd=sum(Prod,na.rm=T), AvgYield=sum(Prod,na.rm=T)/sum(CropArea,na.rm=T)),by="Item,Year"] #Mt (product fresh weight)
#fao.area = fao.area[Year>2005,]
#fao.yield = fao.yield[fao.area, Area:=i.Area/10^6, on=.(Item,Year)] #Join Yield and area; Mha 
#same as below:
fao.yield <- PA[Year>2005,.(TotProd=sum(Production,na.rm=T),                        #Mt (fresh weight)
                            TotArea =sum(CropArea,na.rm=T),                         #Mha
                            AvgYield=sum(Production,na.rm=T)/sum(CropArea,na.rm=T)), by="Item,Year"]  #Mt/Mha or t/ha 

fao.yield[, Crop:=Item]
fao.yield[Item=="Rice, paddy", Crop:="Rice"]
fao.yield[Item=="Soybeans", Crop:="Soy"]
fao.yield[Item=="Sugar cane", Crop:="Sugarcane"]
#fao.yield[Item=="Seed cotton", Crop:="Cotton"]
fao.yield[, Item:=NULL] #remove column item

tail(fao.yield)
fao.yield=fao.yield[Crop%in%cropname, ] 
fao.yield=rbind(fao.yield, cotton[Year>2005,]) #add cotton


unique(fao.yield$Crop)
fao.yield[, Case:="FAO"]




#========-crop yield - carbon yield conversion:
#"Maize"  "Rice"  "Soy"  "Sugarcane" "Wheat"
#dry = rep(c(0.87, 0.91, 0.92, 0.7*0.907, 0.89), each=10) #conversion to dry matter (West et al. 2010), 0.907 is correction of reported tons per unit cane yield to kg mass
#"Maize"  "Rice"  "Sugarcane" "Wheat"  "Soy"  "Cotton"
#dry = rep(c(0.87, 0.91,  0.3,  0.89, 0.92, 0.92), each=12) #West et al. 2010.Cropland carbon fluxes in the United States: increasing geospatial resolution of inventory-based carbon accounting
#note: 0.3 is the conversion ratio of fresh cane yield to dry wt from Waclawovsky et al. 2010, West et al. 2010 use 0.7
#sugarcane cover an area of 22 million hectares, very similar to SSP2/5 values (FAO, Waclawovsky et al. 2010)
#*Cane yield was converted to biomass dry matter by calculating stalk dry wt (t cane ha−1 yr−1 × 0.30) (Waclawovsky et al. 2010)
dry = data.table(Crop=c("Maize", "Rice", "Sugarcane", "Wheat", "Soy", "Cotton"), #CLM may simulate yield of sucrose rather than yield of cane, scale down by a factor 7.98
      dry=c(0.87, 0.91,  0.7/7.98,  0.89, 0.92, 0.92)) #use sucrose production for sugarcane (Tonnes cane per tonne sugar (tc/ts) is in average 7.98 after 13 months, Thangavelu 2004)

#===instead of convert fao.yield to C/ha, better convert model yield from tC/ha to tonnes dry weight/ha
fao.yield <- fao.yield[dry, AvgYield:=AvgYield*i.dry, on=.(Crop)] #fao yield data is often fresh weight
fao.yield <- fao.yield[dry, TotProd:=TotProd*i.dry, on=.(Crop)]   #convert to dry weight (and to sucrose for sugarcane)


#===convert model yield to tonnes/ha 
md.yield=yield.all[Case=="RCP85" & Year<=2018,] #Pick RCP85 or RCP45, same in 2006:2015
md.yield[Crop=="Corn", Crop:="Maize"]
md.yield[Crop=="Soybean", Crop:="Soy"]
unique(md.yield$Crop)

md.yield[Case=="RCP85", Case:="CLM5"]
md.yield[,TotProd:=Yield..MtC.year.] #MtC
md.yield[,TotArea:=Area..Mha.]
md.yield[,AvgYield:=TotProd/TotArea] #MtC/Mha
md.yield[,Yield..MtC.year.:=NULL]
md.yield[,Area..Mha.:=NULL]

#convert carbon to agricultural yield according to CLM user guide (eq. 2.26.9)
md.yield[, AvgYield:=AvgYield/0.45*0.85] #0.85 is harvest efficiency
md.yield[, TotProd:=TotProd/0.45*0.85] #0.45 is the ratio of grain C to total dry weight

#Sugarcane production is 1254.8 million ton/year cane or 55 million ton/year sucrose of 13 million ha in 2011; area is comparable to CLM's 20 million ha.
#Wheat area is much larger than fao data, possibly due to irrigated wheat (Danica Lombardozzi also show overestimation); 
#but AvgYields between md and fao are similar for wheat
#md.yield[Crop=="Wheat",]$Area = yield6crop[Irrigated==0 & Crop=="Wheat" & Year<=2015 & Case=="RCP45",]$Area..Mha. 
#md.yield[Crop=="Wheat",]$TotProd = yield6crop[Irrigated==0 & Crop=="Wheat" & Year<=2015 & Case=="RCP45",]$Yield..MtC.year.


#ratio of modeled:observed average yield
summary(md.yield[Crop=="Maize"]$AvgYield/fao.yield[Crop=="Maize"]$AvgYield)
summary(md.yield[Crop=="Wheat"]$AvgYield/fao.yield[Crop=="Wheat"]$AvgYield)
summary(md.yield[Crop=="Soy"]$AvgYield/fao.yield[Crop=="Soy"]$AvgYield)
summary(md.yield[Crop=="Rice"]$AvgYield/fao.yield[Crop=="Rice"]$AvgYield)
summary(md.yield[Crop=="Sugarcane"]$AvgYield/fao.yield[Crop=="Sugarcane"]$AvgYield)
summary(md.yield[Crop=="Cotton"]$AvgYield/fao.yield[Crop=="Cotton"]$AvgYield)

#ratio of modeled:observed total production
summary(md.yield[Crop=="Maize"]$TotProd/fao.yield[Crop=="Maize"]$TotProd)
summary(md.yield[Crop=="Wheat"]$TotProd/fao.yield[Crop=="Wheat"]$TotProd)
summary(md.yield[Crop=="Soy"]$TotProd/fao.yield[Crop=="Soy"]$TotProd)
summary(md.yield[Crop=="Rice"]$TotProd/fao.yield[Crop=="Rice"]$TotProd)
summary(md.yield[Crop=="Sugarcane"]$TotProd/fao.yield[Crop=="Sugarcane"]$TotProd)
summary(md.yield[Crop=="Cotton"]$TotProd/fao.yield[Crop=="Cotton"]$TotProd)



save(list = c("md.yield","fao.yield"), file = paste0(wd,"/data/RData/Fig2.Crop_yield_validation_FAO.RData")) #statistical prediction matches well with modeled difference





